library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.3     ✓ purrr   0.3.4
## ✓ tibble  3.0.5     ✓ dplyr   1.0.3
## ✓ tidyr   1.1.2     ✓ stringr 1.4.0
## ✓ readr   1.4.0     ✓ forcats 0.5.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(corrplot)
## corrplot 0.84 loaded
library("PerformanceAnalytics")
## Loading required package: xts
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## 
## Attaching package: 'xts'
## The following objects are masked from 'package:dplyr':
## 
##     first, last
## 
## Attaching package: 'PerformanceAnalytics'
## The following object is masked from 'package:graphics':
## 
##     legend
library(gtsummary)
## #Uighur
library(flextable)
## 
## Attaching package: 'flextable'
## The following object is masked from 'package:gtsummary':
## 
##     as_flextable
## The following object is masked from 'package:purrr':
## 
##     compose
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift
library(glmnet)
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
## Loaded glmnet 4.1
library(e1071)
## 
## Attaching package: 'e1071'
## The following objects are masked from 'package:PerformanceAnalytics':
## 
##     kurtosis, skewness
food_data <- read_csv("Food_Supply_kcal_Data.csv")
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   .default = col_double(),
##   Country = col_character(),
##   Undernourished = col_character(),
##   `Unit (all except Population)` = col_character()
## )
## ℹ Use `spec()` for the full column specifications.
head(food_data)
## # A tibble: 6 x 32
##   Country `Alcoholic Beve… `Animal Product… `Animal fats` `Aquatic Produc…
##   <chr>              <dbl>            <dbl>         <dbl>            <dbl>
## 1 Afghan…           0                  4.78         0.850                0
## 2 Albania           0.912             16.1          1.06                 0
## 3 Algeria           0.0896             6.03         0.194                0
## 4 Angola            1.94               4.69         0.264                0
## 5 Antigu…           2.30              15.4          1.54                 0
## 6 Argent…           1.44              15.0          1.06                 0
## # … with 27 more variables: `Cereals - Excluding Beer` <dbl>, Eggs <dbl>,
## #   `Fish, Seafood` <dbl>, `Fruits - Excluding Wine` <dbl>, Meat <dbl>, `Milk -
## #   Excluding Butter` <dbl>, Miscellaneous <dbl>, Offals <dbl>, Oilcrops <dbl>,
## #   Pulses <dbl>, Spices <dbl>, `Starchy Roots` <dbl>, Stimulants <dbl>, `Sugar
## #   Crops` <dbl>, `Sugar & Sweeteners` <dbl>, Treenuts <dbl>, `Vegetal
## #   Products` <dbl>, `Vegetable Oils` <dbl>, Vegetables <dbl>, Obesity <dbl>,
## #   Undernourished <chr>, Confirmed <dbl>, Deaths <dbl>, Recovered <dbl>,
## #   Active <dbl>, Population <dbl>, `Unit (all except Population)` <chr>
sum(food_data[1,2:24])
## [1] 100.0002
food_data <- food_data  %>%
  drop_na() %>%
  mutate(Confirmed_per_capita = Confirmed/Population * 100000000, Deaths_per_capita = Deaths/Population * 100000000) %>%
  #creating a column that specifies whether each country falls above or below the median
  mutate(Cases_vs_median = as.character(ifelse(Confirmed_per_capita > median(Confirmed_per_capita), "Above", "Below")), 
         Deaths_vs_median = as.character(ifelse(Deaths_per_capita > median(Deaths_per_capita), "Above", "Below")))

food_data <- food_data %>%
  select(-Miscellaneous, -Obesity, -Undernourished, -Recovered, -Active, -Population, -`Unit (all except Population)`, 
         -Confirmed, -Deaths)
head(food_data)
## # A tibble: 6 x 27
##   Country `Alcoholic Beve… `Animal Product… `Animal fats` `Aquatic Produc…
##   <chr>              <dbl>            <dbl>         <dbl>            <dbl>
## 1 Afghan…           0                  4.78         0.850                0
## 2 Albania           0.912             16.1          1.06                 0
## 3 Algeria           0.0896             6.03         0.194                0
## 4 Angola            1.94               4.69         0.264                0
## 5 Argent…           1.44              15.0          1.06                 0
## 6 Armenia           0.227             12.8          1.77                 0
## # … with 22 more variables: `Cereals - Excluding Beer` <dbl>, Eggs <dbl>,
## #   `Fish, Seafood` <dbl>, `Fruits - Excluding Wine` <dbl>, Meat <dbl>, `Milk -
## #   Excluding Butter` <dbl>, Offals <dbl>, Oilcrops <dbl>, Pulses <dbl>,
## #   Spices <dbl>, `Starchy Roots` <dbl>, Stimulants <dbl>, `Sugar Crops` <dbl>,
## #   `Sugar & Sweeteners` <dbl>, Treenuts <dbl>, `Vegetal Products` <dbl>,
## #   `Vegetable Oils` <dbl>, Vegetables <dbl>, Confirmed_per_capita <dbl>,
## #   Deaths_per_capita <dbl>, Cases_vs_median <chr>, Deaths_vs_median <chr>
corrplot(is.corr=TRUE, cor(food_data[,2:24], use="pairwise.complete.obs"), method="number")

chart.Correlation(food_data[,2:24], histogram=TRUE)

# summary statistics
food_data_1 <- food_data %>%
  select(-Deaths_vs_median, -Country)
tbl_summary(data = food_data_1, by = "Cases_vs_median",
            statistic = list(all_continuous() ~ "{mean} ({sd})",
                             all_categorical() ~ "{n}({p}%)")) %>%
  add_n() %>%
  add_p(test = all_continuous() ~ "aov") %>%
  as_flex_table() %>%
  bold(part = "header")
food_data_2 <- food_data %>%
  select(-Cases_vs_median, -Country)
tbl_summary(data = food_data_2, by = "Deaths_vs_median",
            statistic = list(all_continuous() ~ "{mean} ({sd})",
                             all_categorical() ~ "{n}({p}%)")) %>%
  add_n() %>%
  add_p(test = all_continuous() ~ "aov") %>%
  as_flex_table() %>%
  bold(part = "header")
# prepare data for cross validation
food_data <- food_data %>%
  # delete country since only each country has only one observation
  select(-`Animal Products`, -Country)
food_data %>% head()
## # A tibble: 6 x 25
##   `Alcoholic Beve… `Animal fats` `Aquatic Produc… `Cereals - Excl…   Eggs
##              <dbl>         <dbl>            <dbl>            <dbl>  <dbl>
## 1           0              0.850                0             37.1 0.150 
## 2           0.912          1.06                 0             16.2 0.809 
## 3           0.0896         0.194                0             25.0 0.418 
## 4           1.94           0.264                0             18.4 0.0441
## 5           1.44           1.06                 0             16.8 0.864 
## 6           0.227          1.77                 0             19.3 0.731 
## # … with 20 more variables: `Fish, Seafood` <dbl>, `Fruits - Excluding
## #   Wine` <dbl>, Meat <dbl>, `Milk - Excluding Butter` <dbl>, Offals <dbl>,
## #   Oilcrops <dbl>, Pulses <dbl>, Spices <dbl>, `Starchy Roots` <dbl>,
## #   Stimulants <dbl>, `Sugar Crops` <dbl>, `Sugar & Sweeteners` <dbl>,
## #   Treenuts <dbl>, `Vegetal Products` <dbl>, `Vegetable Oils` <dbl>,
## #   Vegetables <dbl>, Confirmed_per_capita <dbl>, Deaths_per_capita <dbl>,
## #   Cases_vs_median <chr>, Deaths_vs_median <chr>
# numeric 
set.seed(123)
cv_folds_10 = createFolds(y=food_data$Confirmed_per_capita, k=10)
result_lm <- list()
result_knn <- list()
result_lasso <- list()
result_ridge <- list()
      for(f in 1:length(cv_folds_10)){
        food_data_train_cv_10 <- food_data[-cv_folds_10[[f]],]
        food_data_test_cv_10 <- food_data[cv_folds_10[[f]],]
        lm_fit = lm(Confirmed_per_capita~.-Deaths_per_capita-Cases_vs_median-Deaths_vs_median, data = food_data_train_cv_10)
        knn_fit = train(Confirmed_per_capita~.-Deaths_per_capita-Cases_vs_median-Deaths_vs_median, data = food_data_train_cv_10, 
                         method = "knn", tuneLength = 20, preProcess = c("scale", "center"),
                        trControl = trainControl(method="cv", number = 5))
        lasso_fit = cv.glmnet(x = data.matrix(food_data[,c(1:21)]), y = data.matrix(food_data[,22]), alpha=1)
        ridge_fit = cv.glmnet(x = data.matrix(food_data[,c(1:21)]), y = data.matrix(food_data[,22]), alpha=0)
        
        #linear
        food_data_test_cv_10$Confirmed_lm_predict <- predict(lm_fit, newdata = food_data_test_cv_10)
        result_lm[[f]] = mean((food_data_test_cv_10$Confirmed_per_capita - food_data_test_cv_10$Confirmed_lm_predict)^2, na.rm=TRUE)
        food_data_test_cv_10 <- food_data_test_cv_10 %>%
          select(-Confirmed_lm_predict)
        #knn
        food_data_test_cv_10$Confirmed_knn_predict <- predict(knn_fit, newdata = food_data_test_cv_10)
        result_knn[[f]] = mean((food_data_test_cv_10$Confirmed_per_capita - food_data_test_cv_10$Confirmed_knn_predict)^2, na.rm=TRUE)
        food_data_test_cv_10 <- food_data_test_cv_10 %>%
          select(-Confirmed_knn_predict)
        #lasso
        food_data_test_cv_10 = as.data.frame(food_data_test_cv_10)
        food_data_test_cv_10$Confirmed_lasso_predict <- as.matrix(cbind(1,food_data_test_cv_10[,c(1:21)])) %*% coef(lasso_fit, s = "lambda.min")
        result_lasso[[f]] = mean((food_data_test_cv_10$Confirmed_per_capita - food_data_test_cv_10$Confirmed_lasso_predict)^2, na.rm=TRUE)
        food_data_test_cv_10 <- food_data_test_cv_10 %>%
          select(-Confirmed_lasso_predict)        
        #ridge
        food_data_test_cv_10$Confirmed_ridge_predict <- as.matrix(cbind(1,food_data_test_cv_10[,c(1:21)])) %*% coef(ridge_fit, s = "lambda.min")
        result_ridge[[f]] = mean((food_data_test_cv_10$Confirmed_per_capita - food_data_test_cv_10$Confirmed_ridge_predict)^2, na.rm=TRUE)
        food_data_test_cv_10 <- food_data_test_cv_10 %>%
          select(-Confirmed_ridge_predict)
      }

data.frame_lm = data.frame("CV_error" = mean(unlist(result_lm)), "CV_error_SE" = sd(unlist(result_lm)))
data.frame_knn = data.frame("CV_error" = mean(unlist(result_knn)), "CV_error_SE" = sd(unlist(result_knn)))
data.frame_lasso = data.frame("CV_error" = mean(unlist(result_lasso)), "CV_error_SE" = sd(unlist(result_lasso)))
data.frame_ridge = data.frame("CV_error" = mean(unlist(result_ridge)), "CV_error_SE" = sd(unlist(result_ridge)))
rbind("Linear Regression" = data.frame_lm, "KNN" = data.frame_knn, "Lasso" = data.frame_lasso, "Ridge" = data.frame_ridge)
##                   CV_error CV_error_SE
## Linear Regression 61949.59    83274.45
## KNN               38668.35    43959.12
## Lasso             26944.69    29722.09
## Ridge             34409.81    40599.06
# categorical
set.seed(123)
cv_folds_10 = createFolds(y=food_data$Cases_vs_median, k=10)
result_logistic <- list()
result_knn <- list()
result_linear_svm <- list()
result_radial_svm <- list()
result_polynomial_svm <- list()
food_data %>% head()
## # A tibble: 6 x 25
##   `Alcoholic Beve… `Animal fats` `Aquatic Produc… `Cereals - Excl…   Eggs
##              <dbl>         <dbl>            <dbl>            <dbl>  <dbl>
## 1           0              0.850                0             37.1 0.150 
## 2           0.912          1.06                 0             16.2 0.809 
## 3           0.0896         0.194                0             25.0 0.418 
## 4           1.94           0.264                0             18.4 0.0441
## 5           1.44           1.06                 0             16.8 0.864 
## 6           0.227          1.77                 0             19.3 0.731 
## # … with 20 more variables: `Fish, Seafood` <dbl>, `Fruits - Excluding
## #   Wine` <dbl>, Meat <dbl>, `Milk - Excluding Butter` <dbl>, Offals <dbl>,
## #   Oilcrops <dbl>, Pulses <dbl>, Spices <dbl>, `Starchy Roots` <dbl>,
## #   Stimulants <dbl>, `Sugar Crops` <dbl>, `Sugar & Sweeteners` <dbl>,
## #   Treenuts <dbl>, `Vegetal Products` <dbl>, `Vegetable Oils` <dbl>,
## #   Vegetables <dbl>, Confirmed_per_capita <dbl>, Deaths_per_capita <dbl>,
## #   Cases_vs_median <chr>, Deaths_vs_median <chr>
food_data <- food_data %>%
  mutate(Cases_vs_median = as.factor(ifelse(Cases_vs_median == "Above",1,0)),
         Deaths_vs_median = as.factor(ifelse(Deaths_vs_median == "Above",1,0)))

      for(f in 1:length(cv_folds_10)){
        food_data_train_cv_10 <- food_data[-cv_folds_10[[f]],]
        food_data_test_cv_10 <- food_data[cv_folds_10[[f]],]
        knn_fit <- train(Cases_vs_median~.-Confirmed_per_capita -Deaths_per_capita -Deaths_vs_median, data = food_data_train_cv_10, 
                         method = "knn", tuneLength = 20, preProcess = c("scale", "center"),
                        trControl = trainControl(method="cv", number = 5))
        logistic_fit <- glm(Cases_vs_median~.-Confirmed_per_capita -Deaths_per_capita -Deaths_vs_median, data = food_data_train_cv_10, family = binomial())
        svr_linear_tune <- tune(svm, Cases_vs_median~.-Confirmed_per_capita -Deaths_per_capita -Deaths_vs_median, data = food_data_train_cv_10, kernel ="linear")
        linear_svm_fit <- svr_linear_tune$best.model
        svr_radial_tune <- tune(svm, Cases_vs_median~.-Confirmed_per_capita -Deaths_per_capita -Deaths_vs_median, data = food_data_train_cv_10, kernel ="radial")
        radial_svm_fit <- svr_radial_tune$best.model
        svr_polynomial_tune <- tune(svm, Cases_vs_median~.-Confirmed_per_capita -Deaths_per_capita -Deaths_vs_median, data = food_data_train_cv_10, kernel ="polynomial")
        polynomial_svm_fit <- svr_polynomial_tune$best.model

        # knn
        food_data_test_cv_10$Confirmed_knn_predict <- predict(knn_fit, newdata = food_data_test_cv_10)
        result_knn[[f]] <- 1-confusionMatrix(data = food_data_test_cv_10$Confirmed_knn_predict, reference = as.factor(food_data_test_cv_10$Cases_vs_median))$overall[1]
        food_data_test_cv_10 <- food_data_test_cv_10 %>%
          select(-Confirmed_knn_predict)
        #logistic
        food_data_test_cv_10$Confirmed_logistic_predict_num <- predict(logistic_fit, newdata = food_data_test_cv_10, type="response", na.action=na.exclude)
        food_data_test_cv_10$Confirmed_logistic_predict <- as_factor(ifelse(food_data_test_cv_10$Confirmed_logistic_predict_num >= 0.5, 1, 0))
        result_logistic[[f]] <- 1- confusionMatrix(data = food_data_test_cv_10$Confirmed_logistic_predict, reference = as.factor(food_data_test_cv_10$Cases_vs_median))$overall[1]
        food_data_test_cv_10 <- food_data_test_cv_10 %>%
          select(-Confirmed_logistic_predict, -Confirmed_logistic_predict_num )
        # linear svm
        food_data_test_cv_10$Confirmed_linear_svm_predict <- predict(linear_svm_fit, newdata = food_data_test_cv_10)
        result_linear_svm[[f]] <- 1- confusionMatrix(data = food_data_test_cv_10$Confirmed_linear_svm_predict, reference = as.factor(food_data_test_cv_10$Cases_vs_median))$overall[1]
        food_data_test_cv_10 <- food_data_test_cv_10 %>%
          select(-Confirmed_linear_svm_predict)
        # radial svm
        food_data_test_cv_10$Confirmed_radial_svm_predict <- predict(radial_svm_fit, newdata = food_data_test_cv_10)
        result_radial_svm[[f]] <- 1- confusionMatrix(data = food_data_test_cv_10$Confirmed_radial_svm_predict, reference = as.factor(food_data_test_cv_10$Cases_vs_median))$overall[1]
        food_data_test_cv_10 <- food_data_test_cv_10 %>%
          select(-Confirmed_radial_svm_predict)
        # polynomial svm
        food_data_test_cv_10$Confirmed_polynomial_svm_predict <- predict(polynomial_svm_fit, newdata = food_data_test_cv_10)
        result_polynomial_svm[[f]] <- 1- confusionMatrix(data = food_data_test_cv_10$Confirmed_polynomial_svm_predict, reference = as.factor(food_data_test_cv_10$Cases_vs_median))$overall[1]
        food_data_test_cv_10 <- food_data_test_cv_10 %>%
          select(-Confirmed_polynomial_svm_predict)
      }
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
data.frame_knn = data.frame("CV_error" = mean(unlist(result_knn)), "CV_error_SE" = sd(unlist(result_knn)))
data.frame_logistic = data.frame("CV_error" = mean(unlist(result_logistic)), "CV_error_SE" = sd(unlist(result_logistic)))
data.frame_linear_svm = data.frame("CV_error" = mean(unlist(result_linear_svm)), "CV_error_SE" = sd(unlist(result_linear_svm)))
data.frame_radial_svm = data.frame("CV_error" = mean(unlist(result_radial_svm)), "CV_error_SE" = sd(unlist(result_radial_svm)))
data.frame_polynomial_svm = data.frame("CV_error" = mean(unlist(result_polynomial_svm)), "CV_error_SE" = sd(unlist(result_polynomial_svm)))

rbind("KNN" = data.frame_knn, "Logistic Regression" = data.frame_logistic, "Linear SVM" = data.frame_linear_svm, "Radial SVM" = data.frame_radial_svm, "Polynomial SVM" = data.frame_polynomial_svm)
##                      CV_error CV_error_SE
## KNN                 0.2301190  0.09472007
## Logistic Regression 0.2154167  0.08665864
## Linear SVM          0.2016667  0.08239447
## Radial SVM          0.2417857  0.07898161
## Polynomial SVM      0.2536905  0.08373853
    lasso_fit = cv.glmnet(x = data.matrix(food_data[,c(1:21)]), y = data.matrix(food_data[,24]), alpha=1)
    ridge_fit = cv.glmnet(x = data.matrix(food_data[,c(1:21)]), y = data.matrix(food_data[,24]), alpha=0)

qda_fit = train(Cases_vs_median~.-Confirmed_per_capita -Deaths_per_capita -Deaths_vs_median, data = food_data_train_cv_10, method = “qda”)

    #lasso
    food_data_test_cv_10 = as.data.frame(food_data_test_cv_10)
    food_data_test_cv_10$Confirmed_lasso_predict <- as.matrix(cbind(1,food_data_test_cv_10[,c(1:21)])) %*% coef(lasso_fit, s = "lambda.min")
    result_lasso[[f]] = mean((food_data_test_cv_10$Confirmed_per_capita - food_data_test_cv_10$Confirmed_lasso_predict)^2, na.rm=TRUE)
    food_data_test_cv_10 <- food_data_test_cv_10 %>%
      select(-Confirmed_lasso_predict)
    #ridge
    food_data_test_cv_10$Confirmed_ridge_predict <- as.matrix(cbind(1,food_data_test_cv_10[,c(1:21)])) %*% coef(ridge_fit, s = "lambda.min")
    result_ridge[[f]] = mean((kcal_test_cv_10$Confirmed_per_capita - food_data_test_cv_10$Confirmed_ridge_predict)^2, na.rm=TRUE)
    food_data_test_cv_10 <- food_data_test_cv_10 %>%
      select(-Confirmed_ridge_predict)
      

data.frame_lasso = data.frame(“CV_error” = mean(unlist(result_lasso)), “CV_error_SE” = sd(unlist(result_lasso))) data.frame_ridge = data.frame(“CV_error” = mean(unlist(result_ridge)), “CV_error_SE” = sd(unlist(result_ridge)))